#load libraries
library(tidyverse)
library(ggthemes)
library(dplyr)
library(ggmap)
library(maps)
library(caret)
library(class)
library(ggplot2)
library(grid)
library(gridExtra)
library(tidyr)
library(usmap)
library(highcharter)
library(broom)
library(tidyverse)
library(plotly)
library(e1071)
library(RColorBrewer)
#import the data
breweries = read.csv(file.choose(), header = TRUE)
head(breweries, n = 5)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
beers = read.csv(file.choose(), header = TRUE)
head(beers, n = 5)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
#1. How many breweries are present in each state?
# check structure of breweries df
# str(breweries)
## edit the State column to remove the blank space
## also convert to factor
breweries$State = as.factor(substr(breweries$State, 2, 3))
st_reg <- data.frame(State=state.abb, Region=state.region)
st_reg <- rbind(st_reg, data.frame(State="DC", Region="Northeast"))
breweries <- left_join(breweries,st_reg)
## Joining, by = "State"
## confirm all rows have a Region
# breweries %>% filter(is.na(Region) == TRUE)
# Plot breweries by state & region
breweries %>% ggplot(aes(x=State, fill = State)) +
geom_histogram(stat="count") +
labs(title="Count of breweries per state sorted by region") +
theme(legend.position="none", axis.text.x=element_text(angle=45,hjust=0.8)) +
facet_wrap(~Region, scale="free")
The above graph shows the number of breweries per state (broken down by region for more compact viewing).
# Breweries by state
# BreweriesByState = data.frame(table(breweries$State))
# colnames(BreweriesByState) = c("State","Breweries")
# BreweriesByState
# Breweries by state with region
BreweriesByStateRegion = data.frame(table(breweries$State, breweries$Region))
colnames(BreweriesByStateRegion) = c("State","Region","Breweries")
BreweriesByStateRegion %>% filter(Breweries > 0)
## State Region Breweries
## 1 CT Northeast 8
## 2 DC Northeast 1
## 3 MA Northeast 23
## 4 ME Northeast 9
## 5 NH Northeast 3
## 6 NJ Northeast 3
## 7 NY Northeast 16
## 8 PA Northeast 25
## 9 RI Northeast 5
## 10 VT Northeast 10
## 11 AL South 3
## 12 AR South 2
## 13 DE South 2
## 14 FL South 15
## 15 GA South 7
## 16 KY South 4
## 17 LA South 5
## 18 MD South 7
## 19 MS South 2
## 20 NC South 19
## 21 OK South 6
## 22 SC South 4
## 23 TN South 3
## 24 TX South 28
## 25 VA South 16
## 26 WV South 1
## 27 IA North Central 5
## 28 IL North Central 18
## 29 IN North Central 22
## 30 KS North Central 3
## 31 MI North Central 32
## 32 MN North Central 12
## 33 MO North Central 9
## 34 ND North Central 1
## 35 NE North Central 5
## 36 OH North Central 15
## 37 SD North Central 1
## 38 WI North Central 20
## 39 AK West 7
## 40 AZ West 11
## 41 CA West 39
## 42 CO West 47
## 43 HI West 4
## 44 ID West 5
## 45 MT West 9
## 46 NM West 4
## 47 NV West 2
## 48 OR West 29
## 49 UT West 4
## 50 WA West 23
## 51 WY West 4
#2. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.
# rename Brew_ID to match column name in beers (for merge)
breweries = breweries %>% rename(Brewery_id = Brew_ID)
head(breweries, n = 5)
## Brewery_id Name City State Region
## 1 1 NorthGate Brewing Minneapolis MN North Central
## 2 2 Against the Grain Brewery Louisville KY South
## 3 3 Jack's Abby Craft Lagers Framingham MA Northeast
## 4 4 Mike Hess Brewing Company San Diego CA West
## 5 5 Fort Point Beer Company San Francisco CA West
# merge the data
dfBrews = merge(breweries, beers, by = c("Brewery_id"), all = FALSE)
head(dfBrews, n = 6)
## Brewery_id Name.x City State Region Name.y
## 1 1 NorthGate Brewing Minneapolis MN North Central Pumpion
## 2 1 NorthGate Brewing Minneapolis MN North Central Stronghold
## 3 1 NorthGate Brewing Minneapolis MN North Central Parapet ESB
## 4 1 NorthGate Brewing Minneapolis MN North Central Get Together
## 5 1 NorthGate Brewing Minneapolis MN North Central Maggie's Leap
## 6 1 NorthGate Brewing Minneapolis MN North Central Wall's End
## Beer_ID ABV IBU Style Ounces
## 1 2689 0.060 38 Pumpkin Ale 16
## 2 2688 0.060 25 American Porter 16
## 3 2687 0.056 47 Extra Special / Strong Bitter (ESB) 16
## 4 2692 0.045 50 American IPA 16
## 5 2691 0.049 26 Milk / Sweet Stout 16
## 6 2690 0.048 19 English Brown Ale 16
tail(dfBrews, n = 6)
## Brewery_id Name.x City State Region
## 2405 556 Ukiah Brewing Company Ukiah CA West
## 2406 557 Butternuts Beer and Ale Garrattsville NY Northeast
## 2407 557 Butternuts Beer and Ale Garrattsville NY Northeast
## 2408 557 Butternuts Beer and Ale Garrattsville NY Northeast
## 2409 557 Butternuts Beer and Ale Garrattsville NY Northeast
## 2410 558 Sleeping Lady Brewing Company Anchorage AK West
## Name.y Beer_ID ABV IBU Style Ounces
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener 12
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12
## 2407 Snapperhead IPA 51 0.068 NA American IPA 12
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12
# rename columns in merged data
colnames(dfBrews) = c("Brewery_ID","Brewery_Name","City","State","Region","Beer_Name",
"Beer_ID","ABV","IBU","Style","Ounces")
Heat map of breweries by state
# create new df to play with for heat map
dfBrews2 = dfBrews
dfBrews2$StateName = state.name[match(dfBrews$State, state.abb)]
# head(dfBrews2, n = 5)
# count up the occurance of each state.
BrewMapData = count(dfBrews2, StateName)
# head(BrewMapData, n = 5)
colnames(BrewMapData)[2] = "breweries" # change "n" to "breweries"
# make new column region with lowercase state name
BrewMapData$region <- tolower(BrewMapData$StateName)
# drop the StateName column (first column)
BrewMapData2 = BrewMapData[-1]
# head(BrewMapData2, n = 5)
# get the state info
states = map_data("state")
# head(states, n = 5)
# merge the state info and the brew data
map.df <- merge(states,BrewMapData2, by="region", all.x=T)
# head(map.df, n = 5)
map.df <- map.df[order(map.df$order),]
# create heat map
ggplot(map.df, aes(x=long,y=lat,group=group))+
geom_polygon(aes(fill = breweries))+
geom_path()+
scale_fill_gradientn(colors = brewer.pal(9,"Blues"), na.value="grey90")+
ggtitle("Breweries by State")+
coord_map()
#3. Address the missing values in each column.
# see nulls in merged data
sapply(dfBrews,function(x) sum(is.na(x)))
## Brewery_ID Brewery_Name City State Region Beer_Name
## 0 0 0 0 0 0
## Beer_ID ABV IBU Style Ounces
## 0 62 1005 0 0
# show breakdown of % missing for each state
na_byState <- data.frame()
for(i in st_reg$State){
na_byState[i,1]=length(which(grepl(i,dfBrews$State)))
na_byState[i,2]=length(which(grepl(i,dfBrews$State) & is.na(dfBrews$IBU)))
}
names(na_byState) <- c("Beers_count","IBU_NAs_count")
na_byState %>% mutate(Percent_NA = round(IBU_NAs_count/Beers_count*100,digits=0))
## Beers_count IBU_NAs_count Percent_NA
## AL 10 1 10
## AK 25 8 32
## AZ 47 23 49
## AR 5 4 80
## CA 183 48 26
## CO 265 119 45
## CT 27 21 78
## DE 2 1 50
## FL 58 21 36
## GA 16 9 56
## HI 27 9 33
## ID 30 13 43
## IL 91 52 57
## IN 139 48 35
## IA 30 5 17
## KS 23 4 17
## KY 21 7 33
## LA 19 9 47
## ME 27 20 74
## MD 21 11 52
## MA 82 31 38
## MI 162 124 77
## MN 55 9 16
## MS 11 0 0
## MO 42 13 31
## MT 40 17 42
## NE 25 16 64
## NV 11 3 27
## NH 8 6 75
## NJ 8 0 0
## NM 14 8 57
## NY 74 28 38
## NC 59 29 49
## ND 3 0 0
## OH 49 17 35
## OK 19 8 42
## OR 125 38 30
## PA 100 53 53
## RI 27 7 26
## SC 14 9 64
## SD 7 7 100
## TN 6 1 17
## TX 130 41 32
## UT 26 15 58
## VT 27 10 37
## VA 40 5 12
## WA 68 25 37
## WV 2 0 0
## WI 87 45 52
## WY 15 3 20
## DC 8 4 50
### set seed
set.seed(7)
### impute values for ABV NAs
dfBrews$ABV = ifelse(is.na(dfBrews$ABV),
round(sample((mean(dfBrews$ABV, na.rm = TRUE) - sd(dfBrews$ABV, na.rm = TRUE)):
(mean(dfBrews$ABV, na.rm = TRUE) + sd(dfBrews$ABV, na.rm = TRUE)),
size = sum(is.na(dfBrews$ABV)), replace = T), 0), dfBrews$ABV)
# columns that have null values
colnames(dfBrews)[!complete.cases(t(dfBrews))]
## [1] "IBU"
### impute values for IBU using Naive Bayes
# created editable data set
combined_df <- dfBrews
# split data frame into IBU known and IBU unknown
ibu_known <- combined_df[which(!is.na(combined_df$IBU)),]
ibu_unknown <- combined_df[which(is.na(combined_df$IBU)),]
############# Visualizations ##################################
#correlation between numerical vectors - weak association
ibu_known %>% select_if(is.numeric) %>% cor() %>% corrplot::corrplot()
#visualizing strongest relationship between IBU and categorical values
plot_ly(ibu_known, x= ~reorder(Style,IBU), y= ~IBU) %>%
add_boxplot() %>%
layout(title="IBU by Beer Style")
#visual comparison with ABV and Style (same order as IBU-relationship)
plot_ly(ibu_known, x= ~reorder(Style,IBU), y= ~ABV) %>%
add_boxplot() %>%
layout(title="ABV by Beer Style\nordered by increasing IBU")
###############################################################
# Training nB for classifying IBU
model <- naiveBayes(IBU~., data=ibu_known)
######### external cross validation ###########################
### multiple iterations
iterations = 100
masterAcc = matrix(nrow = iterations)
for(j in 1:iterations){
train <- ibu_known[sample(seq(1:length(ibu_known$IBU)),
round(.7*length(ibu_known$IBU))),]
test <- ibu_known[-sample(seq(1:length(ibu_known$IBU)),
round(.7*length(ibu_known$IBU))),]
pred <- predict(model, train)
t1 <- table(factor(pred, union(pred, train$IBU)),
factor(train$IBU, union(pred, train$IBU)))
CM <- confusionMatrix(t1)
masterAcc[j] = CM$overall[1]
}
colMeans(masterAcc) # average accuracy across the 150 iterations
## [1] 0.8429705
var(masterAcc)[1] # measure of the variance across the 150 iterations
## [1] 3.697411e-05
# CM
# Impute nB
imp <- predict(model, ibu_unknown)
ibu_unknown_nB <- ibu_unknown
for(i in 1:nrow(ibu_unknown_nB)){
ibu_unknown_nB$IBU[i] <- imp[i]
}
# attaching the predictions for unknown to known IBUs
combined_df_nB <- rbind(ibu_known,ibu_unknown_nB)
combined_df_nB <- combined_df_nB[order(combined_df_nB$Brewery_ID),]
head(combined_df_nB, n = 5)
## Brewery_ID Brewery_Name City State Region Beer_Name
## 1 1 NorthGate Brewing Minneapolis MN North Central Pumpion
## 2 1 NorthGate Brewing Minneapolis MN North Central Stronghold
## 3 1 NorthGate Brewing Minneapolis MN North Central Parapet ESB
## 4 1 NorthGate Brewing Minneapolis MN North Central Get Together
## 5 1 NorthGate Brewing Minneapolis MN North Central Maggie's Leap
## Beer_ID ABV IBU Style Ounces
## 1 2689 0.060 38 Pumpkin Ale 16
## 2 2688 0.060 25 American Porter 16
## 3 2687 0.056 47 Extra Special / Strong Bitter (ESB) 16
## 4 2692 0.045 50 American IPA 16
## 5 2691 0.049 26 Milk / Sweet Stout 16
#4. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
sapply(combined_df_nB,function(x) sum(is.na(x)))
## Brewery_ID Brewery_Name City State Region Beer_Name
## 0 0 0 0 0 0
## Beer_ID ABV IBU Style Ounces
## 0 0 0 0 0
BrewBeerABV = data.frame(combined_df_nB %>% group_by(State, Region) %>% summarise(ABV=median(ABV)))
BrewBeerABV %>% ggplot(mapping=aes(x=State,y=ABV,fill=State)) + geom_bar(stat="identity",width=0.3,position="dodge") +
theme(legend.position="none",axis.text.x=element_text(angle=45,vjust=0.1))+
ggtitle("Median ABV by State") + ylab("Median ABV") + xlab("State") + facet_wrap(~Region, scale = "free")
BrewBeerIBU = data.frame(combined_df_nB %>% group_by(State, Region) %>% summarise(IBU=median(IBU)))
BrewBeerIBU %>% ggplot(mapping=aes(x=State,y=IBU,fill=State)) + geom_bar(stat="identity",width=0.3,position="dodge") +
theme(legend.position="none",axis.text.x=element_text(angle=45,vjust=0.1))+
ggtitle("Median IBU by State") + ylab("Median IBU") + xlab("State") + facet_wrap(~Region, scale = "free")
#5. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
combined_df_nB[which.max(combined_df_nB$ABV),]
## Brewery_ID Brewery_Name City State Region
## 384 52 Upslope Brewing Company Boulder CO West
## Beer_Name Beer_ID ABV IBU
## 384 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128 48
## Style Ounces
## 384 Quadrupel (Quad) 19.2
#Colorado at 12.8%
combined_df_nB[which.max(combined_df_nB$IBU),]
## Brewery_ID Brewery_Name City State Region
## 1857 375 Astoria Brewing Company Astoria OR West
## Beer_Name Beer_ID ABV IBU Style
## 1857 Bitter Bitch Imperial IPA 980 0.082 138 American Double / Imperial IPA
## Ounces
## 1857 12
#Oregon at 138
The state with the ABV is Colorado with a value of 0.128. And the state with the highest IBU is Oregon with an ABV of 138. Neither of these values came from the imputed data (i.e. they were not previously NA values).
#6. Comment on the summary statistics and distribution of the ABV variable.
# ABV quantiles for median ABV values
quantile(BrewBeerABV$ABV)
## 0% 25% 50% 75% 100%
## 0.0275 0.0540 0.0555 0.0580 0.0625
# distribution of ABV
ggplot(data = dfBrews) +
geom_histogram(mapping = aes(x = ABV), fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## ABV distribution is slightly right skewed
# summary statistics for ABV
summary(dfBrews$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.05000 0.05600 0.05824 0.06700 0.12800
# violin plot
## create new df for violin plot
combined_df_nB_2 = combined_df_nB
## create new "hold" column to put 1 value for violin plot to work
combined_df_nB_2$hold = 1
## check the data
head(combined_df_nB_2)
## Brewery_ID Brewery_Name City State Region Beer_Name
## 1 1 NorthGate Brewing Minneapolis MN North Central Pumpion
## 2 1 NorthGate Brewing Minneapolis MN North Central Stronghold
## 3 1 NorthGate Brewing Minneapolis MN North Central Parapet ESB
## 4 1 NorthGate Brewing Minneapolis MN North Central Get Together
## 5 1 NorthGate Brewing Minneapolis MN North Central Maggie's Leap
## 6 1 NorthGate Brewing Minneapolis MN North Central Wall's End
## Beer_ID ABV IBU Style Ounces hold
## 1 2689 0.060 38 Pumpkin Ale 16 1
## 2 2688 0.060 25 American Porter 16 1
## 3 2687 0.056 47 Extra Special / Strong Bitter (ESB) 16 1
## 4 2692 0.045 50 American IPA 16 1
## 5 2691 0.049 26 Milk / Sweet Stout 16 1
## 6 2690 0.048 19 English Brown Ale 16 1
## violin plot
combined_df_nB_2 %>% ggplot(aes(x = hold, y = ABV)) + geom_violin(fill = "blue") +
xlab("All Beers ABV Distribution") + ylab("") + scale_x_discrete(labels = NULL)
zeros <- sum(ifelse(combined_df_nB$ABV==0,1,0))
The ABV variable had 62 imputed zeros. The error rate for this is 2.57%. Additionally, 75% of the ABV falls between 5% - 6.7%. A general note of interest is that Budweiser has an ABV of 5%.
#7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
################## BY STATE #########################
# IBU
quantile(BrewBeerIBU$IBU)
## 0% 25% 50% 75% 100%
## 9 28 36 42 88
####### quick scatterplot ##########################
ggplot(data = combined_df_nB) +
geom_point(mapping = aes(x = ABV, y = IBU), color = "blue")
# linear model fit
lm_fit = lm(ABV ~ IBU, data = combined_df_nB)
combined_df_nB %>% ggplot(aes(x=ABV,y=IBU,color=State)) +
geom_point() +
ggtitle("ABV vs IBU by State") +
theme(legend.position = "none") +
theme(legend.title = element_blank()) +
geom_smooth(method="lm",se=FALSE,color="black",size=0.1)
## `geom_smooth()` using formula 'y ~ x'
####### relationship between ABV and IBU ####################
combined_df_nB %>% ggplot(aes(x=ABV, y=IBU, color=ABV)) +
geom_point() +
geom_smooth() +
labs(color="Ratio of ABV vs IBU") +
ggtitle("ABV vs IBU")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#7 continued
########### ABV and IBU relationship by ALE / IPA #################
#check to be sure we aren't missing anything by looking for IPA
#check for 'India Pale Ale' instead of 'IPA'
sum(grepl("India Pale Ale",combined_df_nB$Style)) - sum(grepl("India Pale Ale",combined_df_nB$Style) & grepl("IPA", combined_df_nB$Style))
## [1] 0
#isolate IPAs & non-IPA Ales / note beer-style as iapsType
ipas = filter(combined_df_nB, grepl("Ale|IPA",Style))
ipas = ipas %>% mutate(ipasType = ifelse(grepl("IPA",Style),"IPA","Ale"))
ipas %>% ggplot(aes(x=ABV *100, y= IBU, color=ipasType)) +
geom_point(position='jitter') +
geom_smooth() +
ggtitle('ABV vs IBU') +
labs(subtitle="Ale and IPAs")+
xlab('ABV (%)') +
theme(legend.title=element_blank())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
quantile(combined_df_nB$IBU)
## 0% 25% 50% 75% 100%
## 1 20 35 65 138
We can see that there is a positive correlation between IBU and ABV. We can see a big cluster around 5% ABV, and we predict that these are regular 12 ounces of beers which, according to NIAAA in the U.S., contain between 4-7% ABV, with the average being 5%. Anything above estimates to be malt liquor which averages to be 7% ABV. It is worth noting that correlation does not equal causation. Next, we decided to compare ABV and IBU by Ale and IPA. We can see that the majority of Ales have low ABV and IBU while the majority of IPAs have high ABV and IBU. Budweiser has average 5% ABV and 7 IBU; therefore, we may carefully suggest focusing on products with ABVs within range of 5.6 to 5.8% and IBUs of 19 to 64. This range might be more competitive in the existing market.
#8. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.
#In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
################################################################################
ipas = filter(combined_df_nB, grepl("Ale|IPA",Style))
ipas = ipas %>% mutate(ipasType = ifelse(grepl("IPA",Style),"IPA","Ale"))
set.seed(7)
splitPerc = 0.7
trainIndices = sample(1:dim(ipas)[1],round(splitPerc*dim(ipas)[1]))
train = ipas[trainIndices,]
test = ipas[-trainIndices,]
classifications = knn(train[,c(8,9)],test[,c(8,9)],as.factor(train$ipasType),prob=TRUE,k=5)
# str(train)
set.seed(7)
splitPerc = 0.7
iterations = 100
nums = 100
masterAcc = matrix(nrow = iterations, ncol = nums)
for (j in 1:iterations)
{
accs = data.frame(accuracy = numeric(100), k=numeric(100))
trainIndices = sample(1:dim(ipas)[1],round(splitPerc*dim(ipas)[1]))
train = ipas[trainIndices,]
test = ipas[-trainIndices,]
for (i in 1:nums)
{
classifications = knn(train[,c(8,9)],test[,c(8,9)],as.factor(train$ipasType),prob=TRUE,k=i)
CM = confusionMatrix(table(as.factor(test$ipasType),classifications))
masterAcc[j,i] = CM$overall[1]
}
}
CM
## Confusion Matrix and Statistics
##
## classifications
## Ale IPA
## Ale 246 40
## IPA 36 138
##
## Accuracy : 0.8348
## 95% CI : (0.7976, 0.8676)
## No Information Rate : 0.613
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6503
##
## Mcnemar's Test P-Value : 0.7308
##
## Sensitivity : 0.8723
## Specificity : 0.7753
## Pos Pred Value : 0.8601
## Neg Pred Value : 0.7931
## Prevalence : 0.6130
## Detection Rate : 0.5348
## Detection Prevalence : 0.6217
## Balanced Accuracy : 0.8238
##
## 'Positive' Class : Ale
##
## masterAcc # removed from knit
meanAcc=colMeans(masterAcc)
# plot k-values v. average accuracy
{plot(seq(1,nums,1), meanAcc,type="l", xlab = "Values for K",
ylab = "Average Accuracy")
abline(v=which.max(meanAcc),col="red",lwd=1)
abline(h=max(meanAcc),col="red",lwd=1)}
# best values for K and highest accuracy
## which.max(meanAcc) # 38 is best value for K
## max(meanAcc) # 81.8% is highest accuracy
# internal cross validation for knn
classifications = knn.cv(ipas[,c(8,9)], ipas$ipasType, prob = TRUE, k = 38)
CM <- confusionMatrix(table(classifications, as.factor(ipas$ipasType)))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 822 140
## IPA 141 431
##
## Accuracy : 0.8168
## 95% CI : (0.7965, 0.8359)
## No Information Rate : 0.6278
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6082
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8536
## Specificity : 0.7548
## Pos Pred Value : 0.8545
## Neg Pred Value : 0.7535
## Prevalence : 0.6278
## Detection Rate : 0.5359
## Detection Prevalence : 0.6271
## Balanced Accuracy : 0.8042
##
## 'Positive' Class : Ale
##
# Accuracy = 81.8%
# Sensitivity = 85.4%
# Specificity = 75.7%
From external cross-validations, the optimal number of nearest neighbors is 38. From this, we checked our accuracy, sensitivity, and specificity of predicting IPA versus non-IPA ales. The accuracy for predicting IPA with internal cross-validation based on ABV and IBU was found to be 81.68%, the specificity was 75.48%, and the sensitivity was 85.36%.
#9
## table(combined_df_nB$Style) #checking list of styles
sum(ifelse(combined_df_nB$Style=="",1,0)) #finding the unidentified styles
## [1] 5
style_byState <- combined_df_nB %>% select(Style, ABV, IBU, State, Region)
#replacing empty strings with 'unknown'
style_byState$Style <- replace(style_byState$Style,
which(style_byState==""),
"Unknown")
## table(style_byState$Style) #checking update
#creating the dataframe for State-preferences
style_byState %>% group_by(State) %>%
summarize(Style = names(which.max(table(Style))),
ABV = mean(ABV), #could switch to median if needed
IBU = mean(IBU))
## # A tibble: 51 x 4
## State Style ABV IBU
## <chr> <chr> <dbl> <dbl>
## 1 AK American IPA 0.0556 34.4
## 2 AL American IPA 0.062 50.1
## 3 AR American Amber / Red Ale 0.052 25
## 4 AZ American IPA 0.0564 37.8
## 5 CA American IPA 0.0607 44.8
## 6 CO American IPA 0.0598 47.0
## 7 CT American Amber / Red Ale 0.0611 42.0
## 8 DC American Blonde Ale 0.0656 54.9
## 9 DE American IPA 0.0275 50.5
## 10 FL American IPA 0.0579 41.4
## # ... with 41 more rows
#creating the summary for grpahing
summary_byState <- style_byState %>%
group_by(State) %>%
summarize(Style = names(which.max(table(Style))),
ABV = median(ABV),
IBU = median(IBU))
summary_byState <- summary_byState %>% dplyr::rename(state=State)
beer_colors <- c("#FFCCFF", "#CC6633", "#993300",
"#330000", "#FFCC66", "#663300",
"#006699", "#3399FF", "#99CCFF",
"#660066", "#66FF00", "#006600")
plot_usmap(regions="states",
data=summary_byState,
values="Style",labels=F,offset=0.5, color="white") +
theme(legend.position="bottom",
legend.title=element_blank()) +
labs(title = "Preferred Beer Style\nby State") +
scale_fill_manual(values=beer_colors)
The graphic here depicts majority preference by state for certain styles of beers. When ties occurred, we gave the win to the most prevalent beer-type nationally. The focus of our analysis here was to present a by-state retail option. With additional supply-chain information, we may be able to help optimize future distribution or optimization efforts.